Project Setup

# Set seed
set.seed(7)

# Importing data
loanData <- read_csv("loan_sample_10.csv")
## Rows: 40000 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): grade, home_ownership, verification_status, purpose, application_type
## dbl (12): loan_amnt, int_rate, annual_inc, dti, open_acc, revol_bal, revol_u...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data <- loanData

# Checking structure of the data
head(data)
## # A tibble: 6 × 17
##   loan_amnt int_rate grade home_ownership annual_inc verification_status purpose
##       <dbl>    <dbl> <chr> <chr>               <dbl> <chr>               <chr>  
## 1     20000    11.5  B     MORTGAGE           110000 Source Verified     credit…
## 2     21000     6.97 A     MORTGAGE            95000 Not Verified        debt_c…
## 3     16000    11.0  B     MORTGAGE           104000 Source Verified     debt_c…
## 4      8400     6.92 A     MORTGAGE            48000 Not Verified        debt_c…
## 5     24000    14.5  C     RENT               110000 Not Verified        debt_c…
## 6      9700    18.5  D     OWN                 37000 Verified            debt_c…
## # ℹ 10 more variables: dti <dbl>, open_acc <dbl>, revol_bal <dbl>,
## #   revol_util <dbl>, total_acc <dbl>, total_rec_int <dbl>,
## #   application_type <chr>, tot_cur_bal <dbl>, total_rev_hi_lim <dbl>,
## #   Status <dbl>
tail(data)
## # A tibble: 6 × 17
##   loan_amnt int_rate grade home_ownership annual_inc verification_status purpose
##       <dbl>    <dbl> <chr> <chr>               <dbl> <chr>               <chr>  
## 1     13325    10.2  B     MORTGAGE            32240 Not Verified        debt_c…
## 2      6000    11.0  B     RENT                50000 Not Verified        credit…
## 3     16000    12.7  C     OWN                 68000 Source Verified     credit…
## 4     14400    14.5  C     MORTGAGE            61000 Source Verified     debt_c…
## 5      9600    10.4  B     RENT                30000 Not Verified        debt_c…
## 6      9000     9.93 B     RENT                95000 Not Verified        debt_c…
## # ℹ 10 more variables: dti <dbl>, open_acc <dbl>, revol_bal <dbl>,
## #   revol_util <dbl>, total_acc <dbl>, total_rec_int <dbl>,
## #   application_type <chr>, tot_cur_bal <dbl>, total_rev_hi_lim <dbl>,
## #   Status <dbl>

The head() and tail() functions show, that there is no unusual observation and the data set looks complete and has no emtpy rows at the end.

# Define colors to create a general style 
custom_fill <- "#105F77"
custom_alpha <- 0.8
custom_color <- "#161616"
custom_light <- "#f4a460"
custom_dark <- "#105F77"

Checking data for NA values

# Check for missing values in each column of the data set and store the result in the "has_NAs" data frame
has_NAs = as.data.frame(apply(data, 2, function(x) any(is.na(x))))

# Map matching column names for a more readable output
colnames(has_NAs) = c("has_NA")

# Print the has_NAs data frame to check if there are any NA values in our data set
has_NAs
##                     has_NA
## loan_amnt            FALSE
## int_rate             FALSE
## grade                FALSE
## home_ownership       FALSE
## annual_inc           FALSE
## verification_status  FALSE
## purpose              FALSE
## dti                  FALSE
## open_acc             FALSE
## revol_bal            FALSE
## revol_util           FALSE
## total_acc            FALSE
## total_rec_int        FALSE
## application_type     FALSE
## tot_cur_bal          FALSE
## total_rev_hi_lim     FALSE
## Status               FALSE

We also found out, that none of the variables has any NA values.

Description of variables

This list describes the variables of the dataset loan_data:

annual_inc: The self-reported annual income provided by the borrower during registration
application_type: Indicates whether the loan is an individual or joint application
dti: Borrower’s total monthly debt payments divided by monthly income
grade: Assigned loan grade by the financial service provider
home_ownership: The home ownership status
int_rate: Interest Rate on the loan
loan_amnt: The listed amount of the loan applied for by the borrower
open_acc: Number of open trades in last 6 months
purpose: A category provided by the borrower for the loan request
revol_bal: Total credit revolving balance
revol_util: Revolving line utilization rate
tot_cur_bal: Total current balance of all accounts
total_acc: The total number of credit lines currently in the borrower’s credit file
total_rec_int: Interest received to date
total_rev_hi_lim: Total revolving high credit/credit limit
verification_status: Indicates if the co-borrowers’ joint income was verified


1 Exercise 1

1.1 Describe the data. Specifically:

1.1.1 Check and report the structure of the data set.

# First, we check the dimension of the dataset
dim(data)
## [1] 40000    17

With the dim function we see, that there are 40’000 observations within 17 variables.

# Next, we display the structure of the dataset, showing the type of each variable (numeric, categorical, etc.).
str(data)
## spc_tbl_ [40,000 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ loan_amnt          : num [1:40000] 20000 21000 16000 8400 24000 9700 9000 5500 10000 12000 ...
##  $ int_rate           : num [1:40000] 11.53 6.97 10.99 6.92 14.49 ...
##  $ grade              : chr [1:40000] "B" "A" "B" "A" ...
##  $ home_ownership     : chr [1:40000] "MORTGAGE" "MORTGAGE" "MORTGAGE" "MORTGAGE" ...
##  $ annual_inc         : num [1:40000] 110000 95000 104000 48000 110000 37000 50000 28000 80000 42500 ...
##  $ verification_status: chr [1:40000] "Source Verified" "Not Verified" "Source Verified" "Not Verified" ...
##  $ purpose            : chr [1:40000] "credit_card" "debt_consolidation" "debt_consolidation" "debt_consolidation" ...
##  $ dti                : num [1:40000] 24.68 11.15 9.66 20.9 18.3 ...
##  $ open_acc           : num [1:40000] 16 9 3 7 10 14 12 7 12 10 ...
##  $ revol_bal          : num [1:40000] 15299 17900 8945 2341 17641 ...
##  $ revol_util         : num [1:40000] 45.5 70.2 65.8 17.3 68.1 55.4 35 24.6 33.4 52.1 ...
##  $ total_acc          : num [1:40000] 32 19 22 18 31 16 19 21 25 27 ...
##  $ total_rec_int      : num [1:40000] 3721 1789 1308 920 3233 ...
##  $ application_type   : chr [1:40000] "Individual" "Individual" "Individual" "Individual" ...
##  $ tot_cur_bal        : num [1:40000] 346890 282222 22696 38145 62426 ...
##  $ total_rev_hi_lim   : num [1:40000] 33600 25500 13600 13500 25900 6900 27100 21500 33500 21800 ...
##  $ Status             : num [1:40000] 0 0 0 0 1 0 0 0 0 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   loan_amnt = col_double(),
##   ..   int_rate = col_double(),
##   ..   grade = col_character(),
##   ..   home_ownership = col_character(),
##   ..   annual_inc = col_double(),
##   ..   verification_status = col_character(),
##   ..   purpose = col_character(),
##   ..   dti = col_double(),
##   ..   open_acc = col_double(),
##   ..   revol_bal = col_double(),
##   ..   revol_util = col_double(),
##   ..   total_acc = col_double(),
##   ..   total_rec_int = col_double(),
##   ..   application_type = col_character(),
##   ..   tot_cur_bal = col_double(),
##   ..   total_rev_hi_lim = col_double(),
##   ..   Status = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

The str function shows us all the different variables and what type they’re from.

1.1.2 How many numeric and how many categorical variables are included in the data? What categorical variable has the most levels in it?

num_vars <- sum(sapply(data, is.numeric))
cat_vars <- sum(sapply(data, is.character))

# Print counts
cat("Number of numeric variables:", num_vars, "\n")
## Number of numeric variables: 12
cat("Number of categorical variables:", cat_vars, "\n")
## Number of categorical variables: 5

Reporting the structure of the data set shows us, that we have 5 character columns and 12 number columns.

1.1.3 Summarize the variables. Discuss the summary statistics obtained.

summary(data)
##    loan_amnt        int_rate        grade           home_ownership    
##  Min.   : 1000   Min.   : 5.31   Length:40000       Length:40000      
##  1st Qu.: 7000   1st Qu.: 9.44   Class :character   Class :character  
##  Median :10000   Median :12.29   Mode  :character   Mode  :character  
##  Mean   :11668   Mean   :12.63                                        
##  3rd Qu.:15075   3rd Qu.:15.05                                        
##  Max.   :40000   Max.   :27.49                                        
##    annual_inc     verification_status   purpose               dti       
##  Min.   :  5000   Length:40000        Length:40000       Min.   : 0.00  
##  1st Qu.: 42000   Class :character    Class :character   1st Qu.:12.12  
##  Median : 57000   Mode  :character    Mode  :character   Median :17.62  
##  Mean   : 63327                                          Mean   :18.22  
##  3rd Qu.: 77000                                          3rd Qu.:23.89  
##  Max.   :400000                                          Max.   :60.14  
##     open_acc      revol_bal       revol_util       total_acc    
##  Min.   : 1.0   Min.   :    0   Min.   :  0.00   Min.   : 3.00  
##  1st Qu.: 8.0   1st Qu.: 5606   1st Qu.: 34.70   1st Qu.:15.00  
##  Median :10.0   Median : 9782   Median : 52.30   Median :20.00  
##  Mean   :10.3   Mean   :11939   Mean   : 52.07   Mean   :21.31  
##  3rd Qu.:13.0   3rd Qu.:15759   3rd Qu.: 69.70   3rd Qu.:27.00  
##  Max.   :23.0   Max.   :78762   Max.   :123.20   Max.   :57.00  
##  total_rec_int    application_type    tot_cur_bal     total_rev_hi_lim
##  Min.   :   0.0   Length:40000       Min.   :     0   Min.   :   300  
##  1st Qu.: 676.6   Class :character   1st Qu.: 25166   1st Qu.: 13000  
##  Median :1341.8   Mode  :character   Median : 53872   Median : 20800  
##  Mean   :1817.5                      Mean   : 99134   Mean   : 24158  
##  3rd Qu.:2431.3                      3rd Qu.:158901   3rd Qu.: 32100  
##  Max.   :8826.0                      Max.   :467467   Max.   :100000  
##      Status      
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.1285  
##  3rd Qu.:0.0000  
##  Max.   :1.0000

While checking the summary, we can observe, that there might be some values with insane outliers. For example annual_inc, revol_bal or tot_cur_bal. These variables all have a small minimum compared to the median and the max value. This could mean that either the data is very wide spread or there are a few outliers that have an impact on the median and the mean.

1.1.4 Check the levels of the target variable by choosing the appropriate visualization. Is the target variable balanced?

ggplot(data, aes(x = factor(Status))) +
  geom_bar(fill = custom_fill, color = custom_color) +
  labs(x = "Loan Status", y = "Count", title = "Distribution of Loan Status")

Checking the levels of the loan status variable, we can see, that the distribution is not balanced. There are much more entries with status = 0 than status = 1.

1.1.5 Check the distribution of the numeric variables in the data set (include different visual representations).

# This code generates a histogram for each numeric variable

# Function to plot histogram for each numeric variable
plot_histograms <- function(data) {
  numeric_vars <- sapply(data, is.numeric)
  data_numeric <- data[, numeric_vars]

  # Loop through each numeric variable
  for(var in names(data_numeric)) {
    print(
      ggplot(data, aes_string(x = var)) +
        geom_histogram(bins = 30, fill = custom_fill, color = custom_color) +
        labs(x = var, y = "Frequency", title = paste("Distribution of", var)) +
        theme_minimal()
    )
  }
}

# Apply the function to the loan dataset
plot_histograms(data)
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.


1.2 Analysing Outliers

# Function to plot box plots for each numeric variable
# The function plot_boxplots identifies all numeric variables in the dataset and creates box plots for each.
plot_boxplots <- function(data) {
  numeric_vars <- sapply(data, is.numeric)
  data_numeric <- data[, numeric_vars]

  # Loop through each numeric variable
  for(var in names(data_numeric)) {
    print(
      ggplot(data, aes(y = var)) +
        geom_boxplot(fill = custom_fill, color = custom_color) +
        labs(y = var, title = paste("Boxplot of", var)) +
        theme_minimal()
    )
  }
}

# Apply the function to the loan dataset
plot_boxplots(data)


1.3 Visualization to display the distribution of the numeric features

To visualize the distribution of numerical features, box plots provide a clear summary of the data distribution and highlight medians, quartiles and outliers. When split by a categorical variable such as loan status, they allow for direct comparison between groups. This is particularly useful for identifying which numerical characteristics might behave differently in the two categories of the target characteristic and provides insight into which variables might be significant predictors.

# Visualizing Numeric Features by Target Feature
# This code generates box plots for each numeric variable, separated by the target feature 'Status'.
# Function to create box plots for each numeric variable, separated by loan status
plot_numeric_by_target <- function(data, target) {
  numeric_vars <- sapply(data, is.numeric)
  data_numeric <- data[, numeric_vars]

  for (var in names(data_numeric)) {
    # Corrected usage of print() to render the plots
    print(
      ggplot(data, aes(x = as.factor(target), y = var, fill = as.factor(target))) +
      geom_boxplot(fill = custom_fill, color = custom_color) +
      labs(title = paste("Distribution of", var, "by Status"), x = "Status", y = var) +
      theme_minimal()
    )
  }
}

# Apply the function to the loan dataset
plot_numeric_by_target(data, data$Status)

Discussion of the visualization
After running the code below, we will get box plots for each numeric variable, broken down by loan status. These plots show the median (middle line in the box) of each group, the dispersion (size of the box), which shows the variability and the ossible outliers (points outside the whiskers).

Based on the generated boxplots for a subset of numeric variables in the loan_data dataset, in this section is our discussion of the findings presented:
Loan Amount (loan_amnt)
The distribution of loan amounts seems relatively similar for both default and non-default groups. However, there’s a slight indication that higher loan amounts might be more prevalent in the default group. This suggests that as the loan amount increases, there might be a slightly higher risk of default.

Interest Rate (int_rate)
The interest rate shows a notable difference between the two groups. Loans that end up in default tend to have higher interest rates. This is evident from the higher median and the spread of the box for the default group. Higher interest rates could be an indicator of increased risk associated with the loan, leading to a higher likelihood of default.

Annual Income (annual_inc)
The annual income of borrowers does not show a significant difference in the medians of the two groups, although the non-default group seems to have a slightly higher income range. This might suggest that while income is a factor, it’s not a strong differentiator between default and non-default cases on its own.

Debt-to-Income Ratio (dti)
The debt-to-income ratio appears slightly higher for the default group. This suggests that a higher DTI ratio might be associated with a greater risk of default. The higher spread in the default group indicates a wider variability in DTI ratios among those who default.

Revolving Balance (revol_bal) The revolving balance shows a relatively similar distribution across both groups, with a slightly higher median in the non-default group. This indicates that revolving balance alone might not be a strong predictor of default.

From this analysis, it seems that the interest rate and debt-to-income ratio are particularly relevant variables in predicting the likelihood of loan default. Higher interest rates and higher DTI ratios are more common in loans that default, suggesting these factors are important in assessing credit risk. While loan amount, annual income, and revolving balance do show some differences, their impact appears to be less pronounced compared to interest rates and DTI ratios.


1.4 Visualization to display the distribution of the categorial features

The code in this section creates bar plots for each categorical variable, showing their association with the loan status (default vs. non-default).

# Function to plot bar plots for each categorical variable
plot_categorical_associations <- function(data) {
  categorical_vars <- sapply(data, is.character) # or is.character, depending on how data is loaded
  data_categorical <- data[, categorical_vars]

  for(var in names(data_categorical)) {

    print(var)
    # Create the plot and assign it to a variable
    plot <- ggplot(data, aes(x = !!sym(var), fill = factor(data$Status))) +
      geom_bar(position = "stack") +
      labs(title = paste("Association between", var, "and Status"), x = var, y = "Count") +
      theme_minimal() +
      scale_fill_brewer(palette = "Set1") +
      guides(fill = guide_legend(title = "Status"))
    
    # Print the plot
    print(plot)
  }
}

# Apply the function to the loan dataset
plot_categorical_associations(data)
## [1] "grade"

## [1] "home_ownership"

## [1] "verification_status"

## [1] "purpose"

## [1] "application_type"

From these plots, we can observe the proportion of default and non-default cases within each category of the categorical variables. The significant differences in the proportion of default vs. non-default cases across the categories of a variable suggest a potential association. For example, if a certain loan purpose or grade shows a noticeably higher proportion of defaults, it may indicate that these variables are relevant in predicting loan defaults.

The following points describe our findings:
Loan Grade (grade):
There is a noticeable trend in loan defaults across different loan grades. Lower grades (typically representing higher risk) show a higher proportion of defaults. This suggests that the loan grade is a significant factor in predicting loan defaults.

Home Ownership (home_ownership)
The distribution of loan status varies across different home ownership categories. However, the variation is not as pronounced as with loan grades. It appears that borrowers with mortgages have a slightly higher count of non-defaults compared to other categories.

Verification Status (verification_status)
The verification status of the borrower also shows an interesting pattern. Loans with verified statuses tend to have a lower proportion of defaults compared to those that are not verified or are source verified. This implies that verification status might play a role in predicting loan defaults.

Purpose of the Loan (purpose)
The purpose of the loan demonstrates varying patterns of defaults. Certain purposes, like debt consolidation, show a higher number of defaults. This variation suggests that the reason for taking out a loan could be an indicator of default risk.

Application Type (application_type)
The application type, whether individual or joint, shows some differences in default rates. Individual applications seem to have a slightly higher proportion of defaults compared to joint applications, indicating that the type of application might be a factor in default risk.

From these observations, it’s evident that certain categorical variables such as loan grade and loan purpose show a more pronounced association with loan status, indicating their potential relevance in predicting loan defaults. Variables like home ownership and application type also show some association but to a lesser degree. The verification status, interestingly, has a noticeable impact, suggesting that the level of scrutiny in the loan approval process can affect the likelihood of a loan defaulting. These insights are valuable for understanding the risk factors associated with loan defaults and can guide further analysis and modeling efforts.


1.5 Visualization of the correlations

The heatmap of the correlation matrix for the numeric features in the loan_data dataset reveals the following insights:

# The code first selects only numeric variables from our dataset. It then calculates the correlation matrix using Pearson correlation. The melt function from the reshape2 package is used to transform the correlation matrix into a long format suitable for ggplot. Finally, a heatmap is plotted using ggplot2 with appropriate color scales to represent the strength and direction of correlations.

# Selecting only the numeric variables from the dataset
numeric_vars <- data[sapply(data, is.numeric)]

# Calculate the correlation matrix
correlation_matrix <- cor(numeric_vars, use = "complete.obs")

# Melt the correlation matrix for visualization
melted_correlation_matrix <- melt(correlation_matrix)

# Plotting the correlation matrix
ggplot(melted_correlation_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = custom_light, high = custom_dark, mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Pearson\nCorrelation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 12, hjust = 1),
        axis.text.y = element_text(size = 12)) +
  labs(x = '', y = '', title = 'Correlation Matrix of Numeric Features')


1.6 Visualization of the correlations

The scatter plot showing the relationship between the loan amount requested and the annual income of the borrowers reveals the following insights:

# We plot an interactive scatter plot showing the association between the loan amount requested and the annual income of the borrower
# Creating an interactive scatter plot
plot <- plot_ly(data, x = ~annual_inc, y = ~loan_amnt, type = "scatter", mode = "markers",
                marker = list(size = 10, opacity = 0.5, color = custom_fill),
                hoverinfo = "text",
                text = ~paste("Annual Income:", annual_inc, "<br>Loan Amount:", loan_amnt))
                
# Adding layout details
plot <- plot %>% layout(title = "Scatter Plot of Loan Amount vs Annual Income",
                        xaxis = list(title = "Annual Income"),
                        yaxis = list(title = "Loan Amount"))

# Display the plot
plot

Association Between Loan Amount and Annual Income
Positive Correlation: There appears to be a positive correlation between the loan amount and the annual income. As the annual income increases, the loan amounts also tend to increase. This suggests that borrowers with higher incomes are likely to request larger loans.

Variability in Loan Amounts
Despite the positive trend, there is considerable variability in the loan amounts across different income levels. This indicates that factors other than income also play a significant role in determining the loan amount.

Income Range
The plot shows a wide range of incomes among borrowers. Those with lower incomes tend to request smaller loans, while those with higher incomes have a broader range of loan amounts.

Outliers and Spread
There are some outliers, particularly at higher income levels, where borrowers with very high incomes request relatively small loan amounts. Additionally, the spread of loan amounts widens with increasing income, suggesting more variability in the loan amounts requested by higher-income borrowers.

Conclusion and Decision on Variables
High Correlation: Given the positive correlation, both annual_inc (annual income) and loan_amnt (loan amount) are important variables for predicting loan behavior and should be considered in the predictive model.

Keeping Both Variables: Although they are correlated, both variables likely contribute unique information. Annual income is a key factor in a borrower’s ability to repay, while the loan amount is directly related to the loan’s terms and risk. Therefore, it would be beneficial to keep both variables in the model.

In summary, the association between loan amount and annual income is significant and positive, indicating the importance of these variables in the predictive model. Their relationship suggests that while income is a factor in determining loan amounts, other variables also influence the final loan amount requested.


1.7 New balanced data set

In this step we creating a balanced dataset where the two levels of the target variable (‘Status’) are equally represented, and then we visualize this with a bar plot.

# In this last step we build a balanced dataset

# Check the balance of the original dataset
table(data$Status)
## 
##     0     1 
## 34861  5139
# Create a balanced dataset
# First, separate the dataset into two based on the target variable
data_default <- data %>% filter(Status == 1)
data_no_default <- data %>% filter(Status == 0)

# Then, randomly sample from the larger group to match the size of the smaller group
set.seed(7) # for reproducibility
data_no_default_balanced <- data_no_default %>% sample_n(nrow(data_default))

# Combine the two balanced datasets
balanced_data <- rbind(data_default, data_no_default_balanced)

# Create a bar plot of the newly created target variable
ggplot(balanced_data, aes(x = factor(Status))) +
  geom_bar(fill = custom_fill, color = custom_color) +
  labs(title = "Bar Plot of Balanced Target Variable", x = "Loan Status", y = "Count")

# Check the balance of the new dataset
table(balanced_data$Status)
## 
##    0    1 
## 5139 5139

Balancing the Dataset is important for the following different points:

Avoid Bias in Models
When a dataset is imbalanced, especially in a binary classification problem, models can become biased towards the majority class. This can lead to poor generalization performance on the minority class.

Better Performance Metrics
Balanced datasets provide a more realistic evaluation of model performance metrics, especially those sensitive to class imbalance like accuracy, precision, and recall.

Reflect Real-World Scenarios
In some cases, balancing a dataset can help models learn patterns that are not overrepresented by the majority class, thus better reflecting real-world scenarios where the target classes are more evenly distributed.

2 Exercise 2

2.1 Create a training and test data set

Divide the sample into training and testing set using 70% for training the algorithm.

# Set the seed again because it sometime gets lost
set.seed(7)

# Split the data into training and test sets. 70% of the data will be used for training and 30% for testing.
balanced_data$Status <- as.factor(balanced_data$Status)
div <- createDataPartition(y = balanced_data$Status, p = 0.7, list = F)

# Training Sample
data.train <- balanced_data[div,] # 70% here

# Test Sample
data.test <- balanced_data[-div,] # rest of the 30% data goes here

2.2 Create a logistic regression model

Train the classifier and report the coefficients obtained and interpret the results.

# Set charachter variables which are not a factor as factor so the glm function can work with them
mutate(data.train, Status = as.factor(Status),
        application_type = as.factor(application_type),
        grade = as.factor(grade),
        home_ownership = as.factor(home_ownership),
        purpose = as.factor(purpose),
        verification_status = as.factor(verification_status))
## # A tibble: 7,196 × 17
##    loan_amnt int_rate grade home_ownership annual_inc verification_status
##        <dbl>    <dbl> <fct> <fct>               <dbl> <fct>              
##  1     24000     14.5 C     RENT               110000 Not Verified       
##  2     14400     21.5 D     RENT                65000 Source Verified    
##  3      9000     19.5 D     RENT                28000 Source Verified    
##  4      6075     25.5 D     MORTGAGE            75000 Verified           
##  5      4000     18.1 D     RENT                63000 Verified           
##  6     15475     18.2 D     RENT                43000 Verified           
##  7      8325     15.3 C     RENT                24000 Not Verified       
##  8     21600     23.6 D     RENT                48000 Verified           
##  9      5650     21.8 D     MORTGAGE            30000 Not Verified       
## 10      8000     15.6 C     MORTGAGE            59000 Source Verified    
## # ℹ 7,186 more rows
## # ℹ 11 more variables: purpose <fct>, dti <dbl>, open_acc <dbl>,
## #   revol_bal <dbl>, revol_util <dbl>, total_acc <dbl>, total_rec_int <dbl>,
## #   application_type <fct>, tot_cur_bal <dbl>, total_rev_hi_lim <dbl>,
## #   Status <fct>
model <- glm(Status ~ ., data = data.train, family = binomial())
summary(model)
## 
## Call:
## glm(formula = Status ~ ., family = binomial(), data = data.train)
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -2.006e+00  3.228e-01  -6.214 5.17e-10 ***
## loan_amnt                           5.015e-05  5.876e-06   8.535  < 2e-16 ***
## int_rate                            1.185e-01  1.535e-02   7.716 1.20e-14 ***
## gradeB                              2.675e-01  1.075e-01   2.489  0.01280 *  
## gradeC                              5.391e-01  1.381e-01   3.903 9.51e-05 ***
## gradeD                              5.432e-01  2.037e-01   2.667  0.00766 ** 
## home_ownershipOWN                  -5.281e-02  9.416e-02  -0.561  0.57491    
## home_ownershipRENT                  1.017e-01  6.852e-02   1.484  0.13789    
## annual_inc                         -4.900e-06  1.137e-06  -4.310 1.64e-05 ***
## verification_statusSource Verified  1.389e-01  6.051e-02   2.295  0.02172 *  
## verification_statusVerified         4.117e-02  6.964e-02   0.591  0.55435    
## purposecredit_card                 -1.065e-01  2.569e-01  -0.415  0.67843    
## purposedebt_consolidation          -2.075e-01  2.529e-01  -0.821  0.41192    
## purposehome_improvement            -1.690e-02  2.750e-01  -0.061  0.95101    
## purposehouse                       -4.822e-01  4.602e-01  -1.048  0.29471    
## purposemajor_purchase              -7.615e-02  3.075e-01  -0.248  0.80443    
## purposemedical                     -1.513e-01  3.279e-01  -0.461  0.64449    
## purposemoving                      -3.115e-01  3.928e-01  -0.793  0.42778    
## purposeother                       -2.814e-01  2.697e-01  -1.043  0.29688    
## purposerenewable_energy             3.663e-01  9.338e-01   0.392  0.69490    
## purposesmall_business               2.005e-01  3.536e-01   0.567  0.57065    
## purposevacation                     3.536e-01  3.831e-01   0.923  0.35602    
## purposewedding                     -1.148e-01  7.919e-01  -0.145  0.88471    
## dti                                 1.902e-02  3.627e-03   5.246 1.56e-07 ***
## open_acc                            4.344e-02  9.474e-03   4.586 4.52e-06 ***
## revol_bal                           9.199e-06  7.513e-06   1.224  0.22083    
## revol_util                         -1.733e-03  1.975e-03  -0.878  0.38013    
## total_acc                          -1.420e-02  3.865e-03  -3.673  0.00024 ***
## total_rec_int                      -2.210e-04  2.211e-05  -9.995  < 2e-16 ***
## application_typeJoint App           2.857e-01  2.238e-01   1.277  0.20165    
## tot_cur_bal                        -8.369e-07  3.754e-07  -2.229  0.02578 *  
## total_rev_hi_lim                   -1.236e-05  4.387e-06  -2.818  0.00484 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9975.8  on 7195  degrees of freedom
## Residual deviance: 8981.2  on 7164  degrees of freedom
## AIC: 9045.2
## 
## Number of Fisher Scoring iterations: 4
# Extract the significant variables from the model
significant.variables <- summary(model)$coeff[-1,4] < 0.05
names(significant.variables)[significant.variables == TRUE]
##  [1] "loan_amnt"                          "int_rate"                          
##  [3] "gradeB"                             "gradeC"                            
##  [5] "gradeD"                             "annual_inc"                        
##  [7] "verification_statusSource Verified" "dti"                               
##  [9] "open_acc"                           "total_acc"                         
## [11] "total_rec_int"                      "tot_cur_bal"                       
## [13] "total_rev_hi_lim"

Plot the ROC and the Precision/Recall Curve and interpret the results.

# Test the perfromance of the model with the test data
data.test$fit_score <- predict(model,type='response',data.test)
model_preddiction <- prediction(data.test$fit_score, data.test$Status)
model_roc <- performance(model_preddiction, "tpr", "fpr")

# Plot the ROC Curve
plot(model_roc, lwd=1, colorize = TRUE, main = "Model to predict Status - ROC Curve", 
  cex.main=2, 
  cex.axis=1.7, 
  cex.lab=2)
lines(x=c(0, 1), y=c(0, 1), col="black", lwd=1, lty=3)